I use the spaero::get_stats() function to calculate EWS over a moving window for the measles case incidence data from four cities in Niger. I use a bandwidth of 35 weeks, resulting in a 70 week window. All EWS are calculated with backward_only = TRUE so that EWS are only calculated based on data in the past.
# Load packages -----------------------------------------------------------
library(tidyverse)
library(spaero)
# Load data ---------------------------------------------------------------
fname <- "../../data/clean-data/weekly-measles-incidence-niger-cities-clean.RDS"
measles_data <- readRDS(fname) %>%
filter(year > 1994) # drop first NA year, only used for modeling
# Calculate EWS -----------------------------------------------------------
all_stats <- tibble() # empty tibble for storage
for(do_region in unique(measles_data$region)){
cases <- measles_data %>%
filter(region == do_region) %>%
pull(cases)
city_stats <- spaero::get_stats(
x = cases,
center_trend = "local_constant",
center_kernel = "uniform",
center_bandwidth = 35,
stat_trend = "local_constant",
stat_kernel = "uniform",
stat_bandwidth = 35,
lag = 1,
backward_only = TRUE
)$stats
city_stats_tb <- as_tibble(city_stats) %>%
mutate(
time_iter = 1:n(),
date = unique(measles_data$date)
) %>%
gather(key = ews, value = value, -time_iter, -date) %>%
mutate(region = do_region)
all_stats <- bind_rows(all_stats, city_stats_tb)
}
Define a function to plot the EWS and the observations.
plot_it <- function(dates, cases, ews, lab, title){
par(mar = c(5,5,2,5))
plot(dates, cases, type = "l", main = title)
par(new = T)
plot(dates, ews, type = "l", col = "red", axes=F, xlab=NA, ylab=NA)
axis(side = 4)
mtext(side = 4, line = 3, lab, cex = 0.7)
}
do_region <- "Agadez (City)"
dates <- unique(measles_data$date)
par(mfrow = c(4,3))
for(do_ews in unique(all_stats$ews)){
cases <- filter(measles_data, region == do_region) %>% pull(cases)
ews <- filter(all_stats, region == do_region & ews == do_ews) %>% pull(value)
plot_it(dates, cases, ews, lab = "ews", title = do_ews)
}
do_region <- "Maradi (City)"
dates <- unique(measles_data$date)
par(mfrow = c(4,3))
for(do_ews in unique(all_stats$ews)){
cases <- filter(measles_data, region == do_region) %>% pull(cases)
ews <- filter(all_stats, region == do_region & ews == do_ews) %>% pull(value)
plot_it(dates, cases, ews, lab = "ews", title = do_ews)
}
do_region <- "Niamey (City)"
dates <- unique(measles_data$date)
par(mfrow = c(4,3))
for(do_ews in unique(all_stats$ews)){
cases <- filter(measles_data, region == do_region) %>% pull(cases)
ews <- filter(all_stats, region == do_region & ews == do_ews) %>% pull(value)
plot_it(dates, cases, ews, lab = "ews", title = do_ews)
}
do_region <- "Zinder (City)"
dates <- unique(measles_data$date)
par(mfrow = c(4,3))
for(do_ews in unique(all_stats$ews)){
cases <- filter(measles_data, region == do_region) %>% pull(cases)
ews <- filter(all_stats, region == do_region & ews == do_ews) %>% pull(value)
plot_it(dates, cases, ews, lab = "ews", title = do_ews)
}
As the plots above show, most EWS get swamped out by the “memory” of outbreaks. To try and get around this, I am going to look at EWS after resetting them following outbreaks. I don’t think this is too contrived because real-world early detection systems would have to do the same thing.
Following Toby’s suggestion, I am going to look at the distribution of outbreak sizes (i.e., cumulative cases in each year). I can then fit a mixture distribution of two weighted exponentials to the data, with one exponential explaining the large outbreaks and one the small outbreaks. This provides an objective way to define outbreaks for EWS detection.
pop_data <- tibble(
region = c("Agadez (City)", "Maradi (City)", "Niamey (City)", "Zinder (City)"),
pop = c(118224, 267249, 1027000, 322935)
)
cusum_data <- measles_data %>%
group_by(region, year) %>%
summarise(cases = sum(cases)) %>%
left_join(pop_data)
Joining, by = "region"
ggplot(cusum_data, aes(x = cases/pop)) +
geom_histogram(aes(fill = region))
cusum_cases <- sort(cusum_data$cases / cusum_data$pop)
fit1 <- MASS::fitdistr(cusum_cases, "exponential")
x <- cusum_cases
mixexplik <- function(p,lambda1,lambda2) {
z <- p*dexp(x,lambda1) + (1-p)*dexp(x,lambda2)
return(-sum(log(z)))
}
mle_fit <- suppressWarnings( # ignore warnings about bad guesses
stats4::mle(minuslogl = mixexplik,
start= list(p = 0.2,
lambda1 = as.numeric(fit1$estimate),
lambda2 = as.numeric(fit1$estimate)),
method="Nelder-Mead")
)
get_prob <- function(x, rate1, rate2, p, x_scale, bin_width = 1){
p1 <- pexp((x+bin_width)/x_scale, rate1) - pexp((x-bin_width)/x_scale, rate1)
p2 <- pexp((x+bin_width)/x_scale, rate2) - pexp((x-bin_width)/x_scale, rate2)
mixp <- p*p1+(1-p)*p2
return(c(p1,p2,mixp))
}
x_scale <- 1e05
xstar <- round(x*x_scale)
probs <- sapply(xstar, FUN = get_prob, rate1 = mle_fit@coef["lambda1"],
rate2 = mle_fit@coef["lambda2"], p = mle_fit@coef["p"],
x_scale = x_scale, bin_width = 1)
bound <- qexp(p = 0.5, rate = mle_fit@coef["p"]*mle_fit@coef["lambda1"])
probs_df <- t(log10(probs)) %>%
as_tibble() %>%
dplyr::rename(
"log10prob_large" = V2,
"log10prob_small" = V1,
"log10prob_mix" = p
) %>%
dplyr::mutate(
incidence = x,
outbreak_size = ifelse(incidence < bound, "small", "large"),
plot_prob = ifelse(outbreak_size == "small", log10prob_small, log10prob_large)
)
ggplot(probs_df, aes(x = incidence)) +
geom_line(aes(y = log10prob_mix, color = "forestgreen")) +
geom_point(aes(y = plot_prob, fill = outbreak_size), size = 3, shape = 21, color = "white") +
geom_vline(aes(xintercept = bound), linetype = 2, color = "grey35") +
labs(x = "Case incidence (reports/person)",
y = expression(paste(Log[10]," probability"))) +
scale_fill_discrete(name = NULL, labels = c("Large outbreaks", "Small outbreaks")) +
scale_color_manual(name = NULL, labels = "GMM fit", values = "forestgreen") +
theme_minimal(base_size = 12)
OK, so now I can identify outbreak years in the untransformed (just cases) data for each city.
outbreak_ids <- cusum_data %>%
dplyr::mutate(
incidence = cases/pop,
outbreak_year = ifelse(incidence < bound, FALSE, TRUE)
) %>%
dplyr::select(region, year, outbreak_year)
measles_data <- measles_data %>%
left_join(outbreak_ids)
Joining, by = c("region", "year")
ggplot(measles_data, aes(x = date, y = cases, color = outbreak_year, group = year)) +
geom_line() +
facet_wrap(~region, scales = "free")
Let’s look at a test case of Niamey post 1995 leading up to the outbreak in 1999.
niamey_test <- measles_data %>%
filter(region == "Niamey (City)") %>%
filter(year > 1998 & year < 2000)
cases <- niamey_test$cases
variance <- spaero::get_stats(
x = cases,
center_trend = "local_constant",
center_kernel = "uniform",
center_bandwidth = 16,
stat_trend = "local_constant",
stat_kernel = "uniform",
stat_bandwidth = 16,
lag = 1,
backward_only = TRUE
)$stats$variance
plot_it(dates = niamey_test$date, cases = cases, ews = log(variance), lab = "log(variance)", title = NULL)
niamey_test <- measles_data %>%
filter(region == "Niamey (City)")
out_vars <- tibble()
for(do_year in unique(niamey_test$year)){
tmp_cases <- niamey_test %>%
filter(year == do_year) %>%
pull(cases)
tmp_stats <- spaero::get_stats(
x = tmp_cases,
center_trend = "local_constant",
center_kernel = "uniform",
center_bandwidth = 16,
stat_trend = "local_constant",
stat_kernel = "uniform",
stat_bandwidth = 16,
lag = 1,
backward_only = TRUE
)$stats
tmp_var <- tibble(
year = do_year,
date = filter(niamey_test, year == do_year)$date,
variance = tmp_stats$variance,
outbreak_year = filter(niamey_test, year == do_year)$outbreak_year
)
out_vars <- bind_rows(out_vars, tmp_var)
}
max_case_week <- niamey_test %>%
group_by(year) %>%
filter(cases == max(cases)) %>%
filter(week_of_year == min(week_of_year))
ggplot() +
geom_line(data = out_vars, aes(x = lubridate::week(date), y = log(variance),
group = year, color = outbreak_year)) +
geom_vline(data = max_case_week, aes(xintercept = week_of_year, color = outbreak_year), linetype = 2)
Hmmm, this seems promising. Perhaps I can fit a GAM through the trajectories and use model selection to determine if an interaction model (outbreak year*time interaction effect) fits better than a GAM with no interaction. Then can use Gavin’s methods for determining time points that are different. Model-based minimum time of difference would be the on-average indicator time, which could be compared to max cases within the year. Should do this with data (like above) and the simulations.
gam_data <- out_vars %>%
dplyr::select(date, variance, outbreak_year) %>%
dplyr::mutate(
week_of_year = lubridate::week(date),
log_variance = log(variance+0.00001),
outbreak_year = as.factor(outbreak_year)
) %>%
dplyr::select(-date)
test_formula <- as.formula(
log_variance ~ outbreak_year + s(week_of_year, by = outbreak_year)
)
gam_model <- mgcv::gam(formula = test_formula, data = gam_data)
pred_df <- tibble(
week_of_year = rep(1:52, times = 2),
outbreak_year = rep(as.factor(c(TRUE, FALSE)), each = 52)
)
preds <- predict(gam_model, newdata = pred_df, type = "response", se.fit = TRUE)
ests <- as.numeric(preds[[1]])
ses <- as.numeric(preds[[2]])
pred_df <- pred_df %>%
mutate(preds = ests,
se = ses)
ggplot(pred_df, aes(x = week_of_year, y = preds, color = outbreak_year)) +
geom_line() +
geom_line(aes(y = preds+(se*2)), linetype = 2) +
geom_line(aes(y = preds-(se*2)), linetype = 2)
ggplot(niamey_test, aes(x = week_of_year, y = cases, color = outbreak_year, group = year)) +
geom_line()
gam_data <- niamey_test %>%
dplyr::select(date, cases, outbreak_year) %>%
dplyr::mutate(
week_of_year = lubridate::week(date),
log_cases = log(cases+0.0001),
outbreak_year = as.factor(outbreak_year)
) %>%
dplyr::select(-date)
test_formula <- as.formula(
log_cases ~ outbreak_year + s(week_of_year, by = outbreak_year)
)
gam_model <- mgcv::gam(formula = test_formula, data = gam_data)
pred_df <- tibble(
week_of_year = rep(1:52, times = 2),
outbreak_year = rep(as.factor(c(TRUE, FALSE)), each = 52)
)
preds <- predict(gam_model, newdata = pred_df, type = "response", se.fit = TRUE)
ests <- as.numeric(preds[[1]])
ses <- as.numeric(preds[[2]])
pred_df <- pred_df %>%
mutate(preds = ests,
se = ses)
ggplot(pred_df, aes(x = week_of_year, y = preds, color = outbreak_year)) +
geom_line() +
geom_line(aes(y = preds+(se*2)), linetype = 2) +
geom_line(aes(y = preds-(se*2)), linetype = 2)
Maybe there is a variance value that serves as an indicator? Look at high AUCs from the simulations to see what the variances are.
ews <- read_csv("../../results/ews-emergence.csv") %>%
filter(city == "Maradi" & metric == "variance" & half == "second")
Parsed with column specification:
cols(
metric = col_character(),
sim = col_integer(),
value = col_double(),
half = col_character(),
city = col_character(),
susc_discount = col_double()
)
|======== | 6%
|========= | 6%
|========= | 7% 1 MB
|========== | 8% 1 MB
|=========== | 8% 1 MB
|============ | 9% 1 MB
|============= | 10% 1 MB
|============== | 11% 1 MB
|=============== | 11% 1 MB
|=============== | 12% 1 MB
|================ | 13% 1 MB
|================= | 13% 1 MB
|================== | 14% 1 MB
|=================== | 15% 2 MB
|==================== | 15% 2 MB
|===================== | 16% 2 MB
|====================== | 17% 2 MB
|====================== | 17% 2 MB
|======================= | 18% 2 MB
|======================== | 19% 2 MB
|========================= | 20% 2 MB
|========================== | 20% 2 MB
|=========================== | 21% 2 MB
|============================ | 22% 2 MB
|============================= | 22% 3 MB
|============================== | 23% 3 MB
|=============================== | 24% 3 MB
|================================ | 25% 3 MB
|================================ | 25% 3 MB
|================================= | 26% 3 MB
|================================== | 27% 3 MB
|=================================== | 27% 3 MB
|==================================== | 28% 3 MB
|===================================== | 29% 3 MB
|====================================== | 29% 4 MB
|======================================= | 30% 4 MB
|======================================= | 31% 4 MB
|======================================== | 31% 4 MB
|========================================= | 32% 4 MB
|========================================== | 33% 4 MB
|=========================================== | 34% 4 MB
|============================================ | 34% 4 MB
|============================================= | 35% 4 MB
|============================================== | 36% 4 MB
|=============================================== | 36% 4 MB
|=============================================== | 37% 5 MB
|================================================ | 38% 5 MB
|================================================= | 38% 5 MB
|================================================== | 39% 5 MB
|=================================================== | 40% 5 MB
|==================================================== | 40% 5 MB
|===================================================== | 41% 5 MB
|====================================================== | 42% 5 MB
|======================================================= | 42% 5 MB
|======================================================= | 43% 5 MB
|======================================================== | 44% 5 MB
|========================================================= | 45% 6 MB
|========================================================== | 45% 6 MB
|=========================================================== | 46% 6 MB
|============================================================ | 47% 6 MB
|============================================================= | 47% 6 MB
|============================================================== | 48% 6 MB
|=============================================================== | 49% 6 MB
|================================================================ | 50% 6 MB
|================================================================ | 50% 6 MB
|================================================================= | 51% 6 MB
|================================================================== | 52% 7 MB
|=================================================================== | 52% 7 MB
|==================================================================== | 53% 7 MB
|===================================================================== | 54% 7 MB
|====================================================================== | 54% 7 MB
|======================================================================= | 55% 7 MB
|======================================================================= | 56% 7 MB
|======================================================================== | 56% 7 MB
|========================================================================= | 57% 7 MB
|========================================================================== | 58% 7 MB
|=========================================================================== | 58% 7 MB
|============================================================================ | 59% 8 MB
|============================================================================= | 60% 8 MB
|============================================================================== | 61% 8 MB
|=============================================================================== | 61% 8 MB
|=============================================================================== | 62% 8 MB
|================================================================================ | 63% 8 MB
|================================================================================= | 63% 8 MB
|================================================================================== | 64% 8 MB
|=================================================================================== | 65% 8 MB
|==================================================================================== | 65% 8 MB
|===================================================================================== | 66% 8 MB
|====================================================================================== | 67% 9 MB
|====================================================================================== | 67% 9 MB
|======================================================================================= | 68% 9 MB
|======================================================================================== | 69% 9 MB
|========================================================================================= | 70% 9 MB
|========================================================================================== | 70% 9 MB
|=========================================================================================== | 71% 9 MB
|============================================================================================ | 72% 9 MB
|============================================================================================= | 72% 9 MB
|============================================================================================== | 73% 9 MB
|=============================================================================================== | 74% 10 MB
|================================================================================================ | 75% 10 MB
|================================================================================================ | 75% 10 MB
|================================================================================================= | 76% 10 MB
|================================================================================================== | 77% 10 MB
|=================================================================================================== | 77% 10 MB
|==================================================================================================== | 78% 10 MB
|===================================================================================================== | 79% 10 MB
|====================================================================================================== | 79% 10 MB
|======================================================================================================= | 80% 10 MB
|======================================================================================================= | 81% 10 MB
|======================================================================================================== | 81% 11 MB
|========================================================================================================= | 82% 11 MB
|========================================================================================================== | 83% 11 MB
|=========================================================================================================== | 84% 11 MB
|============================================================================================================ | 84% 11 MB
|============================================================================================================= | 85% 11 MB
|============================================================================================================== | 86% 11 MB
|=============================================================================================================== | 86% 11 MB
|=============================================================================================================== | 87% 11 MB
|================================================================================================================ | 88% 11 MB
|================================================================================================================= | 88% 12 MB
|================================================================================================================== | 89% 12 MB
|=================================================================================================================== | 90% 12 MB
|==================================================================================================================== | 90% 12 MB
|===================================================================================================================== | 91% 12 MB
|====================================================================================================================== | 92% 12 MB
|====================================================================================================================== | 92% 12 MB
|======================================================================================================================= | 93% 12 MB
|======================================================================================================================== | 94% 12 MB
|========================================================================================================================= | 95% 12 MB
|========================================================================================================================== | 95% 12 MB
|=========================================================================================================================== | 96% 13 MB
|============================================================================================================================ | 97% 13 MB
|============================================================================================================================= | 97% 13 MB
|============================================================================================================================== | 98% 13 MB
|===============================================================================================================================| 99% 13 MB
|================================================================================================================================| 100% 13 MB
|================================================================================================================================| 100% 13 MB
ggplot(ews, aes(x = value)) +
geom_histogram() +
facet_wrap(~susc_discount, scales = "free")
med_var <- ews %>%
group_by(susc_discount) %>%
summarise(value = median(value))